Generating the CellDataSet object

# Rename bidirectionally promoted lncRNAs and antisense lncRNAs to
# 'lincRNAs'
fData(dat)$ccdsid <- gsub("antisense", "lincRNA", fData(dat)$ccdsid)
fData(dat)$ccdsid <- gsub("bidirectional_promoter_lncRNA", "lincRNA", fData(dat)$ccdsid)

# Calculate the mean copies per cell among all classes (Bulk) and draw the
# density plot for lincRNAs vs protein coding
dat.means <- detectGenes(dat, min_expr = 1)
dat.means <- dat.means[fData(dat.means)$num_cells_expressed >= 2]
fData(dat.means)$mean_cpc <- apply(exprs(dat.means), 1, mean)
fData(dat.means)$sd <- apply(exprs(dat.means), 1, sd)
fData(dat.means)$BCV <- (fData(dat.means)$sd/fData(dat.means)$mean_cpc)^2

tmp <- data.frame(gene_short_name = fData(dat.means)$gene_short_name, gene_type = fData(dat.means)$ccdsid, 
    mean_cpc = fData(dat.means)$mean_cpc, BCV = fData(dat.means)$BCV, num_cells_expresed = fData(dat.means)$num_cells_expressed)

dat_means <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))

density.plot <- ggplot(dat_means) + geom_density(aes(x = log10(mean_cpc), color = gene_type)) + 
    scale_color_manual(values = c("red", "black")) + theme_bw()

density.plot

# Subset on lncRNAs and mRNAs
dat_lincRNA_sort <- subset(dat_means, gene_type %in% "lincRNA")
dat_mRNA_sort <- subset(dat_means, gene_type %in% "protein_coding")

# Number of lncRNAs
length(dat_lincRNA_sort$gene_short_name)
## [1] 339
# Number of mRNAs
length(dat_mRNA_sort$gene_short_name)
## [1] 14308
# Generate a boxplot on the batch data
dat_means$frac_cells_expressed <- dat_means$num_cells_expresed/length(rownames(pData(dat.means)))

boxplot <- ggplot(dat_means, aes(gene_type, frac_cells_expressed, fill = gene_type)) + 
    geom_boxplot(notch = T) + theme_bw() + scale_fill_manual(values = c("red", 
    "grey50"))

boxplot

# Generate empirical cumulative density function
ecdf <- ggplot(dat_means, aes(x = log10(BCV), color = gene_type)) + stat_ecdf(geom = "line") + 
    theme_bw() + scale_color_manual(values = c("red", "black"))

ecdf

# test dotplot
dot <- ggplot(dat_means, aes(x = log10(mean_cpc), y = frac_cells_expressed, 
    color = gene_type)) + geom_point() + facet_wrap(~gene_type) + theme_bw() + 
    scale_color_manual(values = c("red", "black"))

dot

dat.sort <- detectGenes(dat, min_expr = 1)
dat.sort <- dat.sort[fData(dat.sort)$num_cells_expressed >= 2]
fData(dat.sort)$mean_cpc_expressed_only <- esApply(dat.sort, 1, function(x) {
    tmp <- x[x > 0]
    mean(tmp, na.rm = T)
})
dat.sort <- subset(dat.sort, fData(dat.sort)$ccdsid %in% c("lincRNA", "protein_coding"))

expressed_only_density <- ggplot(fData(dat.sort), aes(x = log10(mean_cpc_expressed_only), 
    color = ccdsid)) + geom_density() + theme_bw() + scale_color_manual(values = c("red", 
    "black"))

expressed_only_density

# Density estimates
lnc.dens <- density(subset(dat_means, gene_type %in% "lincRNA")$mean_cpc)
log.lnc.dens <- density(log10(subset(dat_means, gene_type %in% "lincRNA")$mean_cpc))
PC.dens <- density(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc)
log.PC.dens <- density(log10(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc))

# Learn the distribution of lncRNAs
learned.lnc.den <- DiscreteDistribution(subset(dat_means, gene_type %in% "lincRNA")$mean_cpc)

# Create weighted probabilities on PC gene FPKM values from which to sample
PC.weights.on.lnc.D <- p(learned.lnc.den)(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc, 
    lower.tail = FALSE)
PC.probs.on.lnc.D <- PC.weights.on.lnc.D/sum(PC.weights.on.lnc.D)

# Sample from PC genes to match lncRNA distribution
samp_PC <- sample(subset(dat_means, gene_type %in% "protein_coding")$mean_cpc, 
    replace = T, size = 339, prob = PC.probs.on.lnc.D)

# My Sampling function
mySample <- function(x, n, EmpDist) {
    w <- p(EmpDist)(x$mean_cpc, lower.tail = FALSE)
    probs <- w/sum(w)
    samp <- x[sample(nrow(x), replace = FALSE, size = n, prob = probs), ]
    return(samp)
}

# Check plot
plot(density(log10(mySample(subset(dat_means, gene_type %in% "protein_coding"), 
    n = 339, EmpDist = learned.lnc.den)$mean_cpc)), main = "Sampling mRNAs from the lncRNA expression distribution")
lines(density(log10(r(learned.lnc.den)(339))), col = "blue")
lines(log.lnc.dens, col = "red")
lines(log.PC.dens, col = "green")
legend(x = -0.5, y = 0.5, legend = c("lncRNA distribution", "Random draws from learned dist", 
    "Sampled mRNAs from learned dist"), col = c("red", "blue", "black"), lty = 1)

# Generate a subset of dat_means that is only mRNAs that fit the learned
# lncRNA expression distribution
mRNA.sample <- dat_means[dat_means$gene_short_name %in% sample(subset(dat_means, 
    gene_type %in% "protein_coding")$gene_short_name, replace = T, size = 339, 
    prob = PC.probs.on.lnc.D), ]

# change gene_type to sampled for sampled mRNAs and append it to dat_means
mRNA.sample$gene_type <- "sampled"
dat_sampled <- rbind(dat_means, mRNA.sample)

# Subset on TFs
TFs.sample <- subset(dat_means, dat_means$gene_short_name %in% TFs)
TFs.sample$gene_type <- "TFs"

# Add TFs to dat_sampled
dat_sampled <- rbind(dat_sampled, TFs.sample)

# Draw a new barplot
p1 <- ggplot(dat_sampled, aes(gene_type, frac_cells_expressed, fill = gene_type)) + 
    geom_boxplot(notch = T) + theme_bw() + scale_fill_manual(values = c("red", 
    "grey50", "darkblue", "#33CC00"))
p1

# Draw a new ecdf plot
p2 <- ggplot(dat_sampled, aes(x = log10(BCV), color = gene_type)) + stat_ecdf(geom = "line") + 
    theme_bw() + scale_color_manual(values = c("red", "black", "darkblue", "#33CC00"))
p2

# Draw a new density plot
p3 <- ggplot(dat_sampled, aes(x = log10(mean_cpc), color = gene_type)) + geom_density() + 
    theme_bw() + scale_color_manual(values = c("red", "black", "darkblue", "#33CC00"))
p3

# Draw a tSNE using sample mRNAs from above

subsetmRNA <- subset(dat, fData(dat)$gene_short_name %in% mRNA.sample$gene_short_name)

sampled.dat.tSNE <- Rtsne(t(round(vstExprs(subsetmRNA))), theta = 0.1, check_duplicates = F)

pData(subsetmRNA)$tSNE1_pos <- sampled.dat.tSNE$Y[, 1]
pData(subsetmRNA)$tSNE2_pos <- sampled.dat.tSNE$Y[, 2]

q <- ggplot(pData(subsetmRNA))

sampled_tSNE <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) + 
    theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")

sampled_tSNE

# There doesn't seem to be any more or less organization present here. May
# be worth bootstrapping and quantfitying how well each of the tSNEs cluster
# the data points.
# tSNE using all genes
dat.tSNE <- Rtsne(t(round(vstExprs(dat))), theta = 0.1)

pData(dat)$tSNE1_pos <- dat.tSNE$Y[, 1]
pData(dat)$tSNE2_pos <- dat.tSNE$Y[, 2]

p <- ggplot(pData(dat))

tSNE_cluster <- p + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) + 
    theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")

tSNE_cluster

tSNE_level2class <- p + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = level2class)) + 
    theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9, 
    "Set1"))(length(unique(pData(tmp)$level2class))))

# tSNE using only protein coding genes
pcgenes <- subset(dat, fData(dat)$ccdsid %in% "protein_coding")

protein_coding.dat.tSNE <- Rtsne(t(round(vstExprs(pcgenes))), theta = 0.1, check_duplicates = F)

pData(pcgenes)$tSNE1_pos <- protein_coding.dat.tSNE$Y[, 1]
pData(pcgenes)$tSNE2_pos <- protein_coding.dat.tSNE$Y[, 2]

q <- ggplot(pData(pcgenes))

protein_coding_tSNE <- q + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) + 
    theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")

protein_coding_tSNE

# tSNE using only lncRNAs
lncgenes <- subset(dat, fData(dat)$ccdsid %in% "lincRNA")

lincRNA.dat.tSNE <- Rtsne(t(round(vstExprs(lncgenes))), theta = 0.1, check_duplicates = F)

pData(lncgenes)$tSNE1_pos <- lincRNA.dat.tSNE$Y[, 1]
pData(lncgenes)$tSNE2_pos <- lincRNA.dat.tSNE$Y[, 2]

q1 <- ggplot(pData(lncgenes))

# tSNE using only lncRNAs color by cluster number (as determined by
# Linnarsson Lab)
lincRNA_tSNE <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) + 
    theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")

lincRNA_tSNE

# tSNE using only lncRNAs colored by level1class (as determined by
# Linnarsson Lab)
lincRNA_level1class_tSNE <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, 
    color = level1class)) + theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")

lincRNA_level1class_tSNE

# tSNE using only lncRNAs color by level2clss
lincRNA_tSNE_level2class <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, 
    color = level2class)) + theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9, 
    "Set1"))(length(unique(pData(dat)$level2class))))

lincRNA_tSNE_level2class

# Color by Sex, size is reflective of XIST expression
lincRNA_tSNE_sex <- q1 + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = sex, 
    size = exprs(lncgenes)[2, ])) + scale_size("Xist") + theme_bw() + coord_equal(1) + 
    scale_fill_brewer(palette = "Set1")

lincRNA_tSNE_sex

# Color by infered batch (utililze the run number from cell_id in order to
# infer batch)
pData(lncgenes)$cell_id <- colnames(exprs(lncgenes))
pData(lncgenes)$batch <- str_split_fixed(pData(lncgenes)$cell_id, "_", 2)[, 
    1]

j <- ggplot(pData(lncgenes))

lincRNA_tSNE_batch <- j + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = batch)) + 
    theme_bw() + coord_equal(1) + scale_color_manual(values = colorRampPalette(brewer.pal(9, 
    "Set1"))(length(unique(pData(lncgenes)$batch))))

lincRNA_tSNE_batch

# tSNE by lincRNA without Xist and Tsix (we do this because Xist and Tsix
# expression splits the cells into two groups by sex)
sex_blind.dat <- subset(dat, fData(dat)$ccdsid %in% "lincRNA")
sex_blind_list <- as.vector(rownames(exprs(sex_blind.dat)[c(-2, -6), ]))
sex_blind.dat <- subset(sex_blind.dat, rownames(exprs(sex_blind.dat)) %in% sex_blind_list)

sex_blind_lincRNA.tSNE <- Rtsne(t(round(vstExprs(sex_blind.dat))), theta = 0.1, 
    check_duplicates = F)

pData(sex_blind.dat)$tSNE1_pos <- sex_blind_lincRNA.tSNE$Y[, 1]
pData(sex_blind.dat)$tSNE2_pos <- sex_blind_lincRNA.tSNE$Y[, 2]

n <- ggplot(pData(sex_blind.dat))

sex_blind_tSNE <- n + geom_point(aes(x = tSNE1_pos, y = tSNE2_pos, color = group_num)) + 
    theme_bw() + coord_equal(1) + scale_color_brewer(palette = "Set1")

sex_blind_tSNE

Seperate the “dat” CellDataSet by “Cluster” and calculate mean expression of genes

# Seperate the 'dat' CellDataSet by 'Cluster' and calculate mean expression
# of genes
Cluster.split <- lapply(unique(pData(dat)$group_num), function(x) {
    dat[, pData(dat)$group_num == x]
})

Cluster.split <- lapply(c(1:length(Cluster.split)), function(x) {
    detectGenes(Cluster.split[[x]], min_expr = 1)
})

Cluster.split <- lapply(c(1:length(Cluster.split)), function(i) {
    x <- Cluster.split[[i]]
    x[fData(x)$num_cells_expressed >= 2]
})

Cluster.split <- lapply(Cluster.split, function(x) {
    mean_cpc <- apply(exprs(x), 1, mean)
    fData(x)$mean_cpc <- mean_cpc
    return(x)
})

Cluster.split <- lapply(c(1:length(Cluster.split)), function(i) {
    x <- Cluster.split[[i]]
    frac_cells_expressed <- (fData(x)$num_cells_expressed/length(rownames(pData(x))))
    fData(x)$frac_cells_expressed <- frac_cells_expressed
    return(x)
})

tmp <- data.frame()

group_means <- lapply(c(1:length(Cluster.split)), function(i) {
    x <- Cluster.split[[i]]
    res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid, 
        mean_cpc = fData(x)$mean_cpc, group_num = unique(pData(x)$group_num), 
        frac_cells_expressed = fData(x)$frac_cells_expressed)
    tmp <- rbind(tmp, res)
})

tmp <- plyr::ldply(group_means, data.frame)

group_means <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))

density.plot_Cluster <- ggplot(group_means) + geom_density(aes(x = log10(mean_cpc), 
    color = gene_type)) + facet_grid(. ~ group_num, labeller = labeller(group_num = function(x) {
    paste("Cluster", x, sep = ":")
})) + scale_color_manual(values = c("red", "black")) + theme_bw()

density.plot_Cluster

# Generate box plots
boxplot_cluster <- ggplot(group_means, aes(gene_type, frac_cells_expressed, 
    fill = gene_type))

boxplot_cluster + geom_boxplot() + theme_bw() + scale_fill_manual(values = c("red", 
    "grey50")) + facet_grid(~group_num, labeller = labeller(group_num = function(x) {
    paste("Cluster", x, sep = ":")
}))

# pointplot
group_means1 <- data.table(group_means, key = "gene_short_name")
group_means1 <- group_means1[, .SD[mean_cpc %in% max(mean_cpc)], by = gene_short_name]

group_means2 <- data.table(group_means, key = "gene_short_name")
group_means2 <- group_means2[, .SD[frac_cells_expressed %in% max(frac_cells_expressed) & 
    mean_cpc %in% max(mean_cpc)], by = gene_short_name]

adjusted_point <- ggplot(group_means2, aes(x = log10(mean_cpc), y = frac_cells_expressed, 
    color = gene_type)) + geom_point() + facet_wrap(~gene_type) + theme_bw() + 
    scale_color_manual(values = c("red", "black"))

adjusted_point

Seperate the “dat” CellDataSet by level2class and calculate mean expression of genes

# Seperate the 'dat' CellDataSet by the Interneuron level2classes and
# calculate mean_cpc within this new object
level2.split <- lapply(unique(pData(dat)[pData(dat)$level1class == "interneurons", 
    ]$level2class), function(x) {
    dat[, pData(dat)$level2class == x]
})

level2.split <- lapply(c(1:length(level2.split)), function(x) {
    detectGenes(level2.split[[x]], min_expr = 1)
})

level2.split <- lapply(c(1:length(level2.split)), function(i) {
    x <- level2.split[[i]]
    x[fData(x)$num_cells_expressed >= 2]
})

level2.split <- lapply(level2.split, function(x) {
    mean_cpc <- apply(exprs(x), 1, mean)
    fData(x)$mean_cpc <- mean_cpc
    return(x)
})

tmp <- data.frame()

group_means_level2class <- lapply(c(1:length(level2.split)), function(i) {
    x <- level2.split[[i]]
    res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid, 
        mean_cpc = fData(x)$mean_cpc, level2class = unique(pData(x)$level2class))
    tmp <- rbind(tmp, res)
})

tmp <- plyr::ldply(group_means_level2class, data.frame)

group_means_level2class <- subset(tmp, gene_type %in% c("protein_coding", "lincRNA"))

density.plot_level2class <- ggplot(group_means_level2class) + geom_density(aes(x = log10(mean_cpc), 
    color = gene_type)) + facet_wrap(~level2class, nrow = 2) + scale_color_manual(values = c("red", 
    "black")) + theme_bw()

density.plot_level2class

# Generate a barplot that has all of the subcell types for each of the 9
# clusters (again as determined by the Linnarsson group)
level_2_barplot <- lapply(c(1:9), function(x) {
    pData(Cluster.split[[x]])$level2class %>% unique() %>% length()
})
level_2_barplot <- data.frame(level_2_barplot)
colnames(level_2_barplot) <- c("1", "2", "3", "4", "5", "6", "7", "8", "9")
rownames(level_2_barplot) <- "Number_of_Clusters"
level_2_barplot <- as.data.frame(level_2_barplot)

level_2_barplot <- as.data.frame(t(level_2_barplot))

bar.plot <- ggplot(level_2_barplot)

bar.plot + geom_bar(aes(x = rownames(level_2_barplot), y = level_2_barplot$Number_of_Clusters, 
    fill = rownames(level_2_barplot)), colour = "black", width = 0.5, stat = "identity") + 
    scale_fill_brewer(palette = "Set1", guide = guide_legend(title = "Cluster #")) + 
    xlab("Cluster #") + ylab("Number of cellular subtypes") + theme_bw() + theme(axis.text.x = element_blank())

# Completely split the CellDataSet by level2class and calculate the mean_cpc
# in each of the new CellDataSet objects
complete.level2.split <- lapply(unique(pData(dat)$level2class), function(x) {
    dat[, pData(dat)$level2class == x]
})

complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))), 
    function(x) {
        detectGenes(complete.level2.split[[x]], min_expr = 1)
    })

complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))), 
    function(i) {
        x <- complete.level2.split[[i]]
        x[fData(x)$num_cells_expressed >= 2]
    })

complete.level2.split <- lapply(complete.level2.split, function(x) {
    mean_cpc <- apply(exprs(x), 1, mean)
    fData(x)$mean_cpc <- mean_cpc
    return(x)
})

complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))), 
    function(i) {
        x = complete.level2.split[[i]]
        fData(x)$gene_std_pc <- rowSds(exprs(x))
        return(x)
    })

complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))), 
    function(i) {
        x = complete.level2.split[[i]]
        fData(x)$BCV <- as.vector(fData(x)$gene_std_pc)/as.vector(fData(x)$mean_cpc)
        return(x)
    })

complete.level2.split <- lapply(c(1:length(unique(pData(dat)$level2class))), 
    function(i) {
        x = complete.level2.split[[i]]
        fData(x)$frac_cells_expressed <- fData(x)$num_cells_expressed/length(rownames(pData(x)))
        return(x)
    })

tmp <- data.frame()

bcv_level2class <- lapply(c(1:length(complete.level2.split)), function(i) {
    x <- complete.level2.split[[i]]
    res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid, 
        mean_cpc = fData(x)$mean_cpc, BCV = fData(x)$BCV, gene_sd_pc = fData(x)$gene_std_pc, 
        level2class = unique(pData(x)$level2class), frac_cells_expressed = fData(x)$frac_cells_expressed)
    tmp <- rbind(tmp, res)
})

names(bcv_level2class) <- unique(pData(dat)$level2class)

level2_bcv <- Reduce(function(...) merge(..., all = TRUE), bcv_level2class)
level2_bcv <- subset(level2_bcv, gene_type %in% c("lincRNA", "protein_coding"))

# Generate density plots
density.plot_all_level2class <- ggplot(level2_bcv) + geom_density(aes(x = log10(mean_cpc), 
    color = gene_type)) + facet_wrap(~level2class, nrow = 8) + scale_color_manual(values = c("red", 
    "black")) + theme_bw()

density.plot_all_level2class

# Generate box plots
boxplot_level2Class <- ggplot(level2_bcv, aes(gene_type, frac_cells_expressed, 
    fill = gene_type))
boxplot_level2Class + geom_boxplot() + theme_bw() + scale_fill_manual(values = c("red", 
    "grey50")) + facet_wrap(~level2class, nrow = 8)

# generate point plots
level2_bcv1 <- data.table(level2_bcv, key = "gene_short_name")
level2_bcv1 <- level2_bcv1[, .SD[frac_cells_expressed %in% max(frac_cells_expressed) & 
    mean_cpc %in% max(mean_cpc)], by = gene_short_name]

level2class_adjusted_point <- ggplot(level2_bcv1, aes(x = log10(mean_cpc), y = frac_cells_expressed, 
    color = gene_type)) + geom_point() + facet_wrap(~gene_type) + theme_bw() + 
    scale_color_manual(values = c("red", "black"))

level2class_adjusted_point

# sanity test density plot
q1 <- ggplot(level2_bcv1, aes(x = log10(mean_cpc), color = gene_type)) + geom_density() + 
    theme_bw() + scale_color_manual(values = c("red", "black"))
# Some odds and ends that I may want to come back and use later.
cluster.bcv.dat <- lapply(unique(pData(dat)$group_num), function(x) {
    dat[, pData(dat)$group_num == x]
})

cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(x) {
    detectGenes(cluster.bcv.dat[[x]], min_expr = 1)
})

cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
    x <- cluster.bcv.dat[[i]]
    x[fData(x)$num_cells_expressed >= 2]
})

cluster.bcv.dat <- lapply(cluster.bcv.dat, function(x) {
    mean_cpc <- apply(exprs(x), 1, mean)
    fData(x)$mean_cpc <- mean_cpc
    return(x)
})

cluster.bcv.dat <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
    x = cluster.bcv.dat[[i]]
    fData(x)$gene_std_pc <- rowSds(exprs(x))
    return(x)
})

cluster.bcv.dat <- lapply(c(1:length(unique(cluster.bcv.dat))), function(i) {
    x = cluster.bcv.dat[[i]]
    fData(x)$BCV <- as.vector(fData(x)$gene_std_pc)/as.vector(fData(x)$mean_cpc)
    return(x)
})

tmp <- data.frame()

bcv_cluster <- lapply(c(1:length(cluster.bcv.dat)), function(i) {
    x <- cluster.bcv.dat[[i]]
    res <- data.frame(gene_short_name = fData(x)$gene_short_name, gene_type = fData(x)$ccdsid, 
        mean_cpc = fData(x)$mean_cpc, BCV = fData(x)$BCV, gene_sd_pc = fData(x)$gene_std_pc, 
        cluster_num = rep(i))
    tmp <- rbind(tmp, res)
})

bcv <- Reduce(function(...) merge(..., all = TRUE), bcv_cluster)

linc_bcv <- subset(bcv, bcv$gene_type == "lincRNA")
mRNA_bcv <- subset(bcv, bcv$gene_type == "protein_coding")

delta_median_by_cluster <- lapply(c(1:9), function(x) {
    median(subset(mRNA_bcv, mRNA_bcv$cluster_num == x)$mean_cpc) - median(subset(linc_bcv, 
        linc_bcv$cluster_num == x)$mean_cpc)
})

names(delta_median_by_cluster) <- c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", 
    "Cluster 5", "Cluster 6", "Cluster 7", "Cluster 8", "Cluster 9")

# Generate a scatter plot of Δ median copies per cell (mRNA-lncRNA) for each
# cluster
delta_median_by_cluster <- as.matrix(delta_median_by_cluster)
delta_median_by_cluster <- as.data.frame(delta_median_by_cluster)
colnames(delta_median_by_cluster) <- "delta_median_mean_cpc"
delta_median_by_cluster$delta_median_mean_cpc <- as.numeric(delta_median_by_cluster$delta_median_mean_cpc)

delta_median_by_cluster$Number_of_subtypes <- as.vector(level_2_barplot$Number_of_Clusters)

p <- ggplot(delta_median_by_cluster, aes(x = Number_of_subtypes, y = delta_median_mean_cpc))

scatter.plot <- p + geom_point(aes(color = rownames(delta_median_by_cluster))) + 
    geom_smooth(aes(alpha = 0.1), method = "lm", se = T, color = "black", alpha = 0.2) + 
    scale_color_brewer(palette = "Set1", guide = guide_legend(title = "Cluster #")) + 
    guides(fill = "none") + theme_bw() + xlab("Number of cellular subtypes") + 
    ylab("Δ median copies per cell (mRNA-lncRNA)") + theme(legend.position = "none")

scatter.plot

# generate ecdf plots
subset_bcv <- subset(bcv, bcv$gene_type %in% c("lincRNA", "protein_coding"))
ecdf_cluster <- ggplot(subset_bcv, aes(x = log10(BCV), color = gene_type)) + 
    stat_ecdf(geom = "line") + facet_wrap(~cluster_num, nrow = 1) + scale_color_manual(values = c("red", 
    "black")) + theme_bw()

ecdf_cluster

Session Info

sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] splines   stats4    parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] bios2mds_1.2.2           rgl_0.98.1              
##  [3] cluster_2.0.5            scales_0.4.1            
##  [5] e1071_1.6-8              amap_0.8-14             
##  [7] data.table_1.10.0        distr_2.6               
##  [9] SweaveListingUtils_0.7.5 sfsmisc_1.1-0           
## [11] startupmsg_0.9.3         RColorBrewer_1.1-2      
## [13] GenomicFeatures_1.26.2   AnnotationDbi_1.36.0    
## [15] GenomicRanges_1.26.2     GenomeInfoDb_1.10.2     
## [17] IRanges_2.8.1            S4Vectors_0.12.1        
## [19] tidyr_0.6.1              matrixStats_0.51.0      
## [21] magrittr_1.5             dplyr_0.5.0             
## [23] gapmap_0.0.4             reshape2_1.4.2          
## [25] Rtsne_0.11               monocle_2.2.0           
## [27] DDRTree_0.1.4            irlba_2.1.2             
## [29] VGAM_1.0-2               ggplot2_2.2.1           
## [31] Biobase_2.34.0           BiocGenerics_0.20.0     
## [33] Matrix_1.2-7.1           stringr_1.1.0           
## 
## loaded via a namespace (and not attached):
##  [1] jsonlite_1.2               shiny_0.14.2              
##  [3] assertthat_0.1             Rsamtools_1.26.1          
##  [5] yaml_2.1.14                slam_0.1-40               
##  [7] RSQLite_1.1-2              backports_1.0.4           
##  [9] lattice_0.20-34            limma_3.30.7              
## [11] digest_0.6.11              XVector_0.14.0            
## [13] colorspace_1.3-2           httpuv_1.3.3              
## [15] fastICA_1.2-0              htmltools_0.3.5           
## [17] plyr_1.8.4                 XML_3.98-1.5              
## [19] pheatmap_1.0.8             HSMMSingleCell_0.108.0    
## [21] qlcMatrix_0.9.5            biomaRt_2.30.0            
## [23] zlibbioc_1.20.0            xtable_1.8-2              
## [25] BiocParallel_1.8.1         tibble_1.2                
## [27] combinat_0.0-8             SummarizedExperiment_1.4.0
## [29] lazyeval_0.2.0             mime_0.5                  
## [31] memoise_1.0.0              evaluate_0.10             
## [33] MASS_7.3-45                class_7.3-14              
## [35] tools_3.3.2                formatR_1.4               
## [37] munsell_0.4.3              Biostrings_2.42.1         
## [39] grid_3.3.2                 RCurl_1.95-4.8            
## [41] htmlwidgets_0.8            igraph_1.0.1              
## [43] labeling_0.3               bitops_1.0-6              
## [45] rmarkdown_1.3              gtable_0.2.0              
## [47] DBI_0.5-1                  R6_2.2.0                  
## [49] GenomicAlignments_1.10.0   knitr_1.15.1              
## [51] rtracklayer_1.34.1         rprojroot_1.1             
## [53] stringi_1.1.2              Rcpp_0.12.8